Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)

glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 30112
## Number of samples: 80 (45 ASD, 35 controls)

Keep only differentially expressed probes

plot_data = data.frame('id'=rownames(datExpr), 'mean_expression' = rowMeans(datExpr), 
                       'SFARI_score'=DE_info$`gene-score`!='None',
                       'DExpressed'=DE_info$padj<0.05 & abs(DE_info$log2FoldChange)>log2(1.2))
ggplotly(plot_data %>% ggplot(aes(x=mean_expression, fill=DExpressed, color=DExpressed)) + geom_density(alpha=0.3) + 
         scale_x_log10() + ggtitle('Probe Mean Expression distribution') + theme_minimal())

We lose almost all of the probes with SFARI score

plot_data_SFARI = plot_data %>% filter(SFARI_score)
ggplotly(plot_data_SFARI %>% ggplot(aes(x=mean_expression, fill=DExpressed, color=DExpressed)) + geom_density(alpha=0.3) + 
              ggtitle('Probe Mean Expression distribution for SFARI Genes') + theme_minimal())
print(paste0('Losing ', round(100*(1-mean(plot_data$DExpressed[plot_data$SFARI_score==TRUE])),1), 
             '% of the probes with SFARI score'))
## [1] "Losing 95.9% of the probes with SFARI score"
datExpr = datExpr[plot_data$DExpressed,]
DE_info = DE_info[plot_data$DExpressed,]
datProbes = datProbes[plot_data$DExpressed,]

rm(plot_data, plot_data_SFARI)

Dimensionality reduction using PCA

To make calculations more efficient, we can perform PCA and keep the first 19 principal components which explain over 98% of the total variance

pca = prcomp(datExpr)
datExpr_redDim = pca$x %>% data.frame %>% dplyr::select(PC1:PC19)

Clustering Methods

clusterings = list()

clusterings[['SFARI_score']] = DE_info$`gene-score`
names(clusterings[['SFARI_score']]) = rownames(DE_info)

clusterings[['SFARI_bool']] = DE_info$`gene-score`!='None'
names(clusterings[['SFARI_bool']]) = rownames(DE_info)

clusterings[['syndromic']] = DE_info$syndromic==1
names(clusterings[['syndromic']]) = rownames(DE_info)



K-means clustering

set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=200, nstart=25,
                                      algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 4
abline(v = best_k, col='blue')

datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster



Hierarchical Clustering

Chose k=16 as best number of clusters. SFARI genes seem to group in the last two clusters

h_clusts = datExpr_redDim %>% dist %>% hclust
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
abline(h=19.5, col='blue')

best_k = 16

SFARI and Neuronal related probes don’t seem to concentrate anywhere specific. Could be because there aren’t enough left to see a pattern

clusterings[['hc']] = cutree(h_clusts, best_k)

create_viridis_dict = function(){
  min_score = clusterings[['SFARI_score']] %>% as.numeric %>% min(na.rm=TRUE)
  max_score = clusterings[['SFARI_score']] %>% as.numeric %>% max(na.rm=TRUE)
  viridis_score_cols = viridis(max_score - min_score + 1)
  names(viridis_score_cols) = seq(min_score, max_score)
  
  return(viridis_score_cols)
}

viridis_score_cols = create_viridis_dict()

dend_meta = DE_info[match(labels(h_clusts), DE_info$ID),] %>% 
            mutate('SFARI_score' = viridis_score_cols[`gene-score`],                     # Purple: 2, Yellow: 6
                   'SFARI_bool' = ifelse(`gene-score` != 'None', '#21908CFF', 'white'),  # Acqua
                   'Syndromic' = ifelse(syndromic == T, 'orange', 'white'),
                   'Neuronal' = ifelse(ID %in% GO_neuronal$ID, '#666666','white')) %>% 
            dplyr::select(SFARI_score, SFARI_bool, Syndromic, Neuronal)

h_clusts %>% as.dendrogram %>% set('labels', rep('', nrow(datMeta))) %>% 
             set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)



Consensus Clustering

Samples are grouped into two big clusters, and then many outliers.

*Output plots in folder

*The rest of the output plots can be found in the Data/Gandal/consensusClustering/probes_DE/l1 folder

Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance (keeping 98% of the variance)

  2. Run ICA with that same number of nbComp as principal components kept to then filter them

  3. Select components with kurtosis > 3

  4. Assign genes to clusters with FDR<0.01 using the fdrtool package

ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F, verbose=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]

clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c

clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])

Leaves 73% of the probes without a cluster

ICA_clusters %>% rowSums %>% table
## .
##   0   1   2   3   6 
## 772 242  29  11   1
# ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) + 
#   geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
#   theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()

Trying again these time with all of the principal components and 40 clusters

ICA_output = pca$x %>% data.frame %>% runICA(nbComp=40, method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F, verbose=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]

Doesn’t make a big difference, but it’s still better

ICA_clusters %>% rowSums %>% table
## .
##   0   1   2   3 
## 747 258  46   4
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c

clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])

# ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) + 
#   geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
#   theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()



WGCNA

The soft R-squared value starts in 0.56 but then decreases and doesn’t go up until the end, Highest value does not reach threshold

best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 30, by=1))
##    Power SFT.R.sq   slope truncated.R.sq mean.k. median.k. max.k.
## 1      1  0.56800  1.3900          0.873   751.0     837.0    885
## 2      2  0.32000  0.6070          0.877   611.0     713.0    795
## 3      3  0.16700  0.3270          0.870   522.0     624.0    733
## 4      4  0.09170  0.2180          0.933   457.0     552.0    684
## 5      5  0.04370  0.1370          0.957   408.0     494.0    644
## 6      6  0.01240  0.0652          0.948   368.0     444.0    610
## 7      7  0.00105  0.0176          0.978   336.0     401.0    581
## 8      8  0.00153 -0.0203          0.947   308.0     364.0    555
## 9      9  0.01450 -0.0571          0.989   284.0     332.0    531
## 10    10  0.04330 -0.0973          0.979   264.0     303.0    510
## 11    11  0.07730 -0.1240          0.970   245.0     278.0    491
## 12    12  0.12800 -0.1500          0.951   229.0     255.0    473
## 13    13  0.15200 -0.1670          0.939   215.0     234.0    457
## 14    14  0.20900 -0.1960          0.896   202.0     216.0    442
## 15    15  0.25700 -0.2220          0.859   191.0     199.0    428
## 16    16  0.31500 -0.2450          0.792   180.0     183.0    414
## 17    17  0.37700 -0.2610          0.775   170.0     169.0    402
## 18    18  0.40100 -0.2770          0.761   161.0     157.0    390
## 19    19  0.45000 -0.2930          0.767   153.0     145.0    379
## 20    20  0.48800 -0.3110          0.773   146.0     135.0    369
## 21    21  0.53800 -0.3270          0.766   139.0     125.0    359
## 22    22  0.57500 -0.3420          0.766   132.0     116.0    350
## 23    23  0.59800 -0.3590          0.755   126.0     107.0    341
## 24    24  0.62300 -0.3700          0.759   121.0      99.8    332
## 25    25  0.64100 -0.3850          0.708   116.0      93.1    324
## 26    26  0.67100 -0.3950          0.729   111.0      87.1    316
## 27    27  0.69200 -0.4120          0.720   106.0      81.6    309
## 28    28  0.70500 -0.4250          0.711   102.0      76.2    302
## 29    29  0.67500 -0.4500          0.625    97.9      71.2    295
## 30    30  0.68700 -0.4670          0.630    94.1      66.5    288
network = datExpr_redDim %>% t %>% blockwiseModules(power=28, numericLabels=TRUE)
##      mergeCloseModules: less than two proper modules.
##       ..color levels are 0, 3
##       ..there is nothing to merge.
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)

It leaves 172 genes without a cluster and classifies all the other genes in a single cluster

clusterings[['WGCNA']] %>% table
## .
##   0   1 
## 172 883



Gaussian Mixture Models with hard thresholding

The trajectory is a bit erratic. The lowest BIC is achieved at 20

n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=40, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 20
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)

Plot of clusters with their centroids in gray

gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())



Manual Clustering

Separating the two clouds of points into two clusters

intercept=4.5
slope=-0.01
manual_clusters = as.factor(as.numeric(slope*datExpr_redDim$PC1 + intercept > datExpr_redDim$PC2))
names(manual_clusters) = rownames(datExpr_redDim)
clusterings[['Manual']] = manual_clusters

datExpr_redDim %>% ggplot(aes(PC1, PC2, color=manual_clusters)) + geom_point(alpha=0.3) + 
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              geom_abline(intercept=intercept, slope=slope, color='gray') +
              theme_minimal() + ggtitle('PCA')

clusterings[['Manual']] %>% table
## .
##    0    1 
##   34 1021
rm(intercept, slope, pca)

The blue cluster seems to have four gaussians in both Mean and SD and the salmon cluster has two in both.

manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd), 
                             manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)

Separate probes into four and two Gaussians, respectively by their mean expression:

gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

c1_mean = manual_clusters_data %>% filter(cluster==1) %>% dplyr::select(mean)
rownames(c1_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='1']
gmm_c1_mean = c1_mean %>% GMM(2)

c2_mean = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(mean)
rownames(c2_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_mean = c2_mean %>% GMM(4)

clusterings[['Manual_mean']] = as.character(clusterings[['Manual']])
clusterings[['Manual_mean']][clusterings[['Manual']]==0] = gmm_c1_mean$Log_likelihood %>% 
  apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual_mean']][clusterings[['Manual']]==1] = gmm_c2_mean$Log_likelihood %>% 
  apply(1, function(x) glue('2_',which.max(x)))


plot_gaussians = manual_clusters_data %>% ggplot(aes(x=mean)) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[1],
                args=list(mean=gmm_c1_mean$centroids[1], sd=gmm_c1_mean$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[2],
                args=list(mean=gmm_c1_mean$centroids[2], sd=gmm_c1_mean$covariance_matrices[2])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[3],
                args=list(mean=gmm_c2_mean$centroids[1], sd=gmm_c2_mean$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[4],
                args=list(mean=gmm_c2_mean$centroids[2], sd=gmm_c2_mean$covariance_matrices[2])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[5],
                args=list(mean=gmm_c2_mean$centroids[3], sd=gmm_c2_mean$covariance_matrices[3])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(6)[6],
                args=list(mean=gmm_c2_mean$centroids[4], sd=gmm_c2_mean$covariance_matrices[4])) +
  theme_minimal()

plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_mean']]))) + 
          geom_point() + theme_minimal()

grid.arrange(plot_gaussians, plot_points, ncol=2)

clusterings[['Manual_mean']] %>% table
## .
## 1_1 1_2 2_1 2_2 2_3 2_4 
##  13  21 248 310 273 190

Separate clusters into two Gaussians per diagnosis by their sd:

n_clusters = 2

c1_sd = manual_clusters_data %>% filter(cluster==1) %>% dplyr::select(sd)
rownames(c1_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='1']
gmm_c1_sd = c1_sd %>% GMM(n_clusters)

c2_sd = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(sd)
rownames(c2_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_sd = c2_sd %>% GMM(n_clusters)

clusterings[['Manual_sd']] = as.character(clusterings[['Manual']])
clusterings[['Manual_sd']][clusterings[['Manual']]==0] = gmm_c1_sd$Log_likelihood %>% 
  apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual_sd']][clusterings[['Manual']]==1] = gmm_c2_sd$Log_likelihood %>% 
  apply(1, function(x) glue('2_',which.max(x)))


plot_gaussians = manual_clusters_data %>% ggplot(aes(x=sd)) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[1], # 
                args=list(mean=gmm_c1_sd$centroids[1], sd=gmm_c1_sd$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[2], # 
                args=list(mean=gmm_c1_sd$centroids[2], sd=gmm_c1_sd$covariance_matrices[2])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[3], # 
                args=list(mean=gmm_c2_sd$centroids[1], sd=gmm_c2_sd$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[4], # 
                args=list(mean=gmm_c2_sd$centroids[2], sd=gmm_c2_sd$covariance_matrices[2])) +
  theme_minimal()


plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_sd']]))) + 
          geom_point() + theme_minimal()

grid.arrange(plot_gaussians, plot_points, ncol=2)

clusterings[['Manual_sd']] %>% table
## .
## 1_1 1_2 2_1 2_2 
##  16  18 473 548
rm(c1_sd, c2_sd, gmm_c1_sd, gmm_c2_sd)
# Clean up the environment a bit
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output, 
   ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, 
   best_power, c, manual_clusters, manual_clusters_data, c1_sd, c2_sd, c1_mean, c2_mean, 
   gmm_c1_sd, gmm_c2_sd,gmm_c1_sd, gmm_c1_mean, gmm_c2_mean, p1, p2, pca_data_projection, dend_meta, 
   plot_gaussians, plot_points, n_clusters, viridis_score_cols, gg_colour_hue, create_viridis_dict)

Compare clusterings

Using Adjusted Rand Index:

cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
  cluster1 = as.factor(clusterings[[i]])
  for(j in (i):length(clusterings)){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, cluster_sim)

Scatter plots

  • The simple clustering methods only consider the 1st component, dividing by vertical lines

  • WGCNA doesn’t work well (classifies almost everything as a single class)

  • SFARI genes seem to be everywhere

  • 1st PC seems to reflect the average level of expression of the genes

  • There seems to be a change in behaviour around PC1=0 (CC and WGCNA)

plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),                                 k_means = as.factor(clusterings[['km']]),
                hc = as.factor(clusterings[['hc']]),                   cc = as.factor(clusterings[['cc_l1']]),
                ica = as.factor(clusterings[['ICA_min']]),
                n_ica = as.factor(rowSums(ICA_clusters)),              gmm = as.factor(clusterings[['GMM']]),
                wgcna = as.factor(clusterings[['WGCNA']]),             manual = as.factor(clusterings[['Manual']]),
                manual_mean = as.factor(clusterings[['Manual_mean']]), manual_sd = as.factor(clusterings[['Manual_sd']]),   
                SFARI = as.factor(clusterings[['SFARI_score']]),       SFARI_bool = as.factor(clusterings[['SFARI_bool']]),
                syndromic = as.factor(clusterings[['syndromic']])) %>%
              bind_cols(DE_info[DE_info$ID %in% rownames(datExpr_redDim),]) %>% 
              mutate(avg_expr = log2(rowMeans(datExpr)+1)[rownames(datExpr) %in% rownames(datExpr_redDim)])
rownames(plot_points) = plot_points$ID
selectable_scatter_plot(plot_points, plot_points)